| goatools_go_enrichment |
1 |
- results/tables/go_terms/model_X.genes-mostsigtrans.diffexp.go_term_enrichment.gene_fdr_0-05.go_term_fdr_0-05.tsv
- results/plots/go_terms/model_X.genes-mostsigtrans.diffexp.go_term_enrichment_BP.gene_fdr_0-05.go_term_fdr_0-05.pdf
- results/plots/go_terms/model_X.genes-mostsigtrans.diffexp.go_term_enrichment_CC.gene_fdr_0-05.go_term_fdr_0-05.pdf
- results/plots/go_terms/model_X.genes-mostsigtrans.diffexp.go_term_enrichment_MF.gene_fdr_0-05.go_term_fdr_0-05.pdf
|
|
- goatools =0.9.7
- python =3.7
- pandas =0.25
- matplotlib =3.1
- pillow =6.2
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93 | import sys
sys.stderr = open(snakemake.log[0], "w")
import pandas as pd
import matplotlib.pyplot as plt
from goatools.obo_parser import GODag
from goatools.anno.idtogos_reader import IdToGosReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
from goatools.godag_plot import plot_results#, plot_goid2goobj, plot_gos
# read in directed acyclic graph of GO terms / IDs
obodag = GODag(snakemake.input.obo)
# read in mapping gene ids from input to GO terms / IDs
objanno = IdToGosReader(snakemake.input.ens_gene_to_go, godag = obodag)
# extract namespace(?) -> id2gos mapping
ns2assoc = objanno.get_ns2assc()
for nspc, id2gos in ns2assoc.items():
print("{NS} {N:,} annotated genes".format(NS=nspc, N=len(id2gos)))
# read gene diffexp table
all_genes = pd.read_table(snakemake.input.diffexp)
# select genes significantly differentially expressed according to BH FDR of sleuth
fdr_level_gene = float(snakemake.params.gene_fdr)
sig_genes = all_genes[all_genes['qval']<fdr_level_gene]
# initialize GOEA object
fdr_level_go_term = float(snakemake.params.go_term_fdr)
goeaobj = GOEnrichmentStudyNS(
# list of 'population' of genes looked at in total
pop = all_genes['ens_gene'].tolist(),
# geneid -> GO ID mapping
ns2assoc = ns2assoc,
# ontology DAG
godag = obodag,
propagate_counts = False,
# multiple testing correction method (fdr_bh is false discovery rate control with Benjamini-Hochberg)
methods = ['fdr_bh'],
# significance cutoff for method named above
alpha = fdr_level_go_term
)
goea_results_all = goeaobj.run_study(sig_genes['ens_gene'].tolist())
# write results to text file
goeaobj.wr_tsv(snakemake.output.enrichment, goea_results_all)
# plot results
ensembl_id_to_symbol = dict(zip(all_genes['ens_gene'], all_genes['ext_gene']))
# from first plot output file name, create generic file name to trigger
# separate plots for each of the gene ontology name spaces
outplot_generic = snakemake.output.plot[0].replace('_BP.','_{NS}.', 1).replace('_CC.','_{NS}.', 1).replace('_MF.', '_{NS}.', 1)
goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < fdr_level_go_term]
plot_results(
outplot_generic,
# use pvals for coloring
goea_results=goea_results_sig,
# print general gene symbol instead of Ensembl ID
id2symbol=ensembl_id_to_symbol,
# number of genes to print, or True for printing all (including count of genes)
study_items=True,
# number of genes to print per line
items_p_line=6,
# p-values to use for coloring of GO term nodes (argument name determined from code and testing against value "p_uncorrected")
pval_name="p_fdr_bh"
)
# for all name spaces
for ns in ns2assoc.keys():
# check if no GO terms were found to be significant
if len([r for r in goea_results_sig if r.NS == ns]) == 0:
fig = plt.figure(figsize=(12, 2))
text = fig.text(0.5, 0.5,
"No plot generated, because no GO terms were found significant\n"
"for name space {} and significance levels: genes ({}), GO terms ({}).\n"
"You might want to check those levels and/or your intermediate data.".format(
ns, fdr_level_gene, fdr_level_go_term),
ha='center', va='center', size=20)
fig.savefig( outplot_generic.replace('_{NS}.', "_{}.".format(ns)) )
|
|
| fgsea |
1 |
- results/tables/fgsea/model_X.all-gene-sets.tsv
- results/tables/fgsea/model_X.rank-ties.tsv
- results/tables/fgsea/model_X.sig-gene-sets.tsv
- results/plots/fgsea/model_X.table-plot.pdf
|
|
- r-base =3.6
- r-tidyverse =1.2
- bioconductor-fgsea =1.10
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105 | log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
library("fgsea")
# provides library("tidyverse") and functions load_bioconductor_package() and
# get_prefix_col(), the latter requires snakemake@output[["samples"]] and
# snakemake@params[["covariate"]]
source( file.path(snakemake@scriptdir, 'common.R') )
pkg <- snakemake@params[["bioc_pkg"]]
load_bioconductor_package(snakemake@input[["species_anno"]], pkg)
gene_sets <- gmtPathways(snakemake@input[["gene_sets"]])
diffexp <- read_tsv(snakemake@input[["diffexp"]]) %>%
drop_na(ext_gene) %>%
mutate(ext_gene = str_to_upper(ext_gene)) %>%
# resolve multiple occurences of the same ext_gene, usually
# meaning that several ENSEMBLE genes map to the same gene
# symbol -- so we may loose resolution here, as long as gene
# symbols are used
group_by(ext_gene) %>%
filter( qval == min(qval, na.rm = TRUE) ) %>%
filter( pval == min(pval, na.rm = TRUE) ) %>%
# for the case of min(qval) == 1 and min(pval) == 1, we
# need something else to select a unique entry per gene
filter( mean_obs == max(mean_obs, na.rm = TRUE) ) %>%
mutate(target_id = str_c(target_id, collapse=",")) %>%
mutate(ens_gene = str_c(ens_gene, collapse=",")) %>%
distinct()
signed_pi <- get_prefix_col("signed_pi_value", colnames(diffexp))
ranked_genes <- diffexp %>%
dplyr::select(ext_gene, !!signed_pi) %>%
deframe()
# get and write out rank values that are tied -- a way to check up on respecitve warnings
rank_ties <- enframe(ranked_genes) %>%
mutate(dup = duplicated(value) | duplicated(value, fromLast = TRUE) ) %>%
filter(dup == TRUE) %>%
dplyr::select(ext_gene = name, !!signed_pi := value)
write_tsv(rank_ties, snakemake@output[["rank_ties"]])
fgsea_res <- fgsea(pathways = gene_sets,
stats = ranked_genes,
minSize=10,
maxSize=700,
nproc=snakemake@threads,
nperm=snakemake@params[["nperm"]]
) %>%
as_tibble()
# Annotation is impossible without any entries, so then just write out empty files
if ( (fgsea_res %>% count() %>% pull(n)) == 0 ) {
write_tsv(fgsea_res, path = snakemake@output[["enrichment"]])
write_tsv(fgsea_res, path = snakemake@output[["significant"]])
} else {
# add further annotation
annotated <- fgsea_res %>%
unnest(leadingEdge) %>%
mutate(
leading_edge_symbol = str_to_title(leadingEdge),
leading_edge_entrez_id = mapIds(leading_edge_symbol, x=get(pkg), keytype="SYMBOL", column="ENTREZID"),
leading_edge_ens_gene = mapIds(leading_edge_symbol, x=get(pkg), keytype="SYMBOL", column="ENSEMBL")
) %>%
group_by(pathway) %>%
summarise(
leadingEdge = str_c(leadingEdge, collapse = ','),
leading_edge_symbol = str_c(leading_edge_symbol, collapse = ','),
leading_edge_entrez_id = str_c(leading_edge_entrez_id, collapse = ','),
leading_edge_ens_gene = str_c(leading_edge_ens_gene, collapse = ',')
) %>%
inner_join(fgsea_res %>% dplyr::select(-leadingEdge), by = "pathway") %>%
dplyr::select(-leadingEdge, -leading_edge_symbol,
-leading_edge_entrez_id, -leading_edge_ens_gene,
leading_edge_symbol, leading_edge_ens_gene,
leading_edge_entrez_id, leadingEdge)
# write out fgsea results for all gene sets
write_tsv(annotated, path = snakemake@output[["enrichment"]])
# select significant pathways
sig_gene_sets <- annotated %>%
filter( padj < snakemake@params[["gene_set_fdr"]] )
# write out fgsea results for gene sets found to be significant
write_tsv(sig_gene_sets, path = snakemake@output[["significant"]])
}
height = .7 * (length(gene_sets) + 2)
# table plot of all gene sets
tg <- plotGseaTable(
pathway = gene_sets,
stats = ranked_genes,
fgseaRes = fgsea_res,
gseaParam = 1,
render = FALSE
)
ggsave(filename = snakemake@output[["plot"]], plot = tg, width = 12, height = height, limitsize=FALSE)
|
|
| fgsea_plot_gene_sets |
1 |
- results/plots/fgsea/model_X
|
|
- r-base =3.6
- r-tidyverse =1.2
- bioconductor-fgsea =1.10
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49 | log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
library("fgsea")
# provides library("tidyverse") and function get_prefix_col()
# the latter requires snakemake@output[["samples"]] and
# snakemake@params[["covariate"]]
source( file.path(snakemake@scriptdir, 'common.R') )
gene_sets <- gmtPathways(snakemake@input[["gene_sets"]])
sig_gene_sets <- read_tsv(snakemake@input[["sig_gene_sets"]])
diffexp <- read_tsv(snakemake@input[["diffexp"]]) %>%
drop_na(ext_gene) %>%
mutate(ext_gene = str_to_upper(ext_gene)) %>%
group_by(ext_gene) %>%
filter( qval == min(qval, na.rm = TRUE) ) %>%
mutate(target_id = str_c(target_id, collapse=",")) %>%
mutate(ens_gene = str_c(ens_gene, collapse=",")) %>%
distinct()
signed_pi <- get_prefix_col("signed_pi_value", colnames(diffexp))
ranked_genes <- diffexp %>%
dplyr::select(ext_gene, !!signed_pi) %>%
deframe()
dir.create( snakemake@output[[1]] )
for ( set in names(gene_sets) ) {
# plot gene set enrichment
p <- plotEnrichment(gene_sets[[set]], ranked_genes) +
ggtitle(str_c("gene set: ", set),
subtitle = str_c(
sig_gene_sets %>% filter(pathway == set) %>% pull(size)," genes; ",
"p-value (BH-adjusted): ", sig_gene_sets %>% filter(pathway == set) %>% pull(padj), "\n",
"normalized enrichment score (NES):", sig_gene_sets %>% filter(pathway == set) %>% pull(NES)
)
) +
xlab("gene rank") +
theme_bw( base_size = 16 )
ggsave(
file = str_c( snakemake@output[[1]], "/", snakemake@wildcards[["model"]], ".", set, ".gene-set-plot.pdf"),
width = 10,
height = 7
)
}
|
|
| spia |
1 |
- results/tables/pathways/model_X.pathways.tsv
- results/plots/pathways/model_X.spia-perturbation-plots.pdf
|
|
- r-base =3.6
- bioconductor-spia =2.38
- bioconductor-graphite =1.32
- r-tidyverse =1.2
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47 | log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
library("SPIA")
library("graphite")
# provides library("tidyverse") and functions load_bioconductor_package() and
# get_prefix_col(), the latter requires snakemake@output[["samples"]] and
# snakemake@params[["covariate"]]
source( file.path(snakemake@scriptdir, 'common.R') )
pkg <- snakemake@params[["bioc_pkg"]]
load_bioconductor_package(snakemake@input[["species_anno"]], pkg)
pw_db <- snakemake@params[["pathway_db"]]
db <- pathways(snakemake@params[["species"]], pw_db)
db <- convertIdentifiers(db, "ENSEMBL")
options(Ncpus = snakemake@threads)
diffexp <- read_tsv(snakemake@input[["diffexp"]]) %>%
drop_na(ens_gene) %>%
mutate(ens_gene = str_c("ENSEMBL:", ens_gene))
universe <- diffexp %>% pull(var = ens_gene)
sig_genes <- diffexp %>% filter(qval <= 0.05)
# get logFC equivalent (the sum of beta scores of covariates of interest)
beta_col <- get_prefix_col("b", colnames(sig_genes))
beta <- sig_genes %>%
dplyr::select(ens_gene, !!beta_col) %>%
deframe()
t <- tempdir(check=TRUE)
olddir <- getwd()
setwd(t)
prepareSPIA(db, pw_db)
res <- runSPIA(de = beta, all = universe, pw_db, plots = TRUE, verbose = TRUE)
setwd(olddir)
file.copy(file.path(t, "SPIAPerturbationPlots.pdf"), snakemake@output[["plots"]])
write_tsv(res, snakemake@output[["table"]])
|
|
| sleuth_diffexp |
1 |
- results/plots/mean-var/model_X.mean-variance-plot.pdf
- results/plots/volcano/model_X.volcano-plots.pdf
- results/plots/ma/model_X.ma-plots.pdf
- results/plots/qq/model_X.qq-plots.pdf
- results/sleuth/diffexp/model_X.transcripts.diffexp.rds
- results/sleuth/diffexp/model_X.genes-aggregated.diffexp.rds
- results/sleuth/diffexp/model_X.genes-mostsigtrans.diffexp.rds
- results/tables/diffexp/model_X.transcripts.diffexp.tsv
- results/tables/diffexp/model_X.genes-aggregated.diffexp.tsv
- results/tables/diffexp/model_X.genes-mostsigtrans.diffexp.tsv
|
|
- r-sleuth =0.30
- bioconductor-rhdf5lib =1.6.0
- bioconductor-rhdf5 =2.28.0
- bioconductor-biomart =2.42.0
- r-gridextra =2.3
- r-pheatmap =1.0.12
- r-tidyverse =1.2.1
- r-ggpubr =0.2.4
- r-base =3.6
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174 | log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
library("sleuth")
library("tidyverse")
library("fs")
library("grid")
library("gridExtra")
model <- snakemake@params[["model"]]
sleuth_object <- sleuth_load(snakemake@input[[1]])
sleuth_object <- sleuth_fit(sleuth_object, as.formula(model[["full"]]), 'full')
sleuth_object <- sleuth_fit(sleuth_object, as.formula(model[["reduced"]]), 'reduced')
sleuth_object <- sleuth_lrt(sleuth_object, "reduced", "full")
# plot mean-variance
mean_var_plot <- plot_mean_var(sleuth_object,
which_model = "full",
point_alpha = 0.4,
point_size = 2,
point_colors = c("black", "dodgerblue"),
smooth_alpha = 1,
smooth_size = 0.75,
smooth_color = "red")
ggsave(snakemake@output[["mean_var_plot"]], mean_var_plot)
write_results <- function(so, mode, output, output_all) {
so$pval_aggregate <- FALSE
if(mode == "aggregate") {
# workaround the following bug-request:
# https://github.com/pachterlab/sleuth/pull/240
# TODO renaming can be removed when fixed
g_col <- so$gene_column
so$gene_column <- NULL
so$pval_aggregate <- TRUE
so$gene_column <- g_col
}
plot_model <- snakemake@wildcards[["model"]]
# list for qq-plots to make a multipage pdf-file as output
qq_list <- list()
print("Performing likelihood ratio test")
all <- sleuth_results(so, "reduced:full", "lrt", show_all = TRUE, pval_aggregate = so$pval_aggregate) %>%
arrange(pval)
covariates <- colnames(design_matrix(so, "full"))
covariates <- covariates[covariates != "(Intercept)"]
# iterate over all covariates and perform wald test in order to obtain beta estimates
if(!so$pval_aggregate) {
# lists for volcano and ma-plots to make a multipage pdf-file as output
volcano_list <- list()
ma_list <- list()
for(covariate in covariates) {
print(str_c("Performing wald test for ", covariate))
so <- sleuth_wt(so, covariate, "full")
volc_plot_title <- str_c(plot_model, ": volcano plot for ", covariate)
ma_plot_title <- str_c(plot_model, ": ma-plot for ", covariate)
qq_plot_title <- str_c(plot_model, ": qq-plot from wald test for ", covariate)
# volcano plot
print(str_c("Performing volcano plot for ", covariate))
volcano <- plot_volcano(so, covariate, "wt", "full",
sig_level = snakemake@params[["sig_level_volcano"]],
point_alpha = 0.2,
sig_color = "red",
highlight = NULL) +
ggtitle(volc_plot_title) +
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
volcano_list[[volc_plot_title]] <- volcano
# ma-plot
print(str_c("Performing ma-plot for ", covariate))
ma_plot <- plot_ma(so, covariate, "wt", "full",
sig_level = snakemake@params[["sig_level_ma"]],
point_alpha = 0.2,
sig_color = "red",
highlight = NULL,
highlight_color = "green") +
ggtitle(ma_plot_title) +
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
ma_list[[ma_plot_title]] <- ma_plot
# qq-plots from wald test
print(str_c("Performing qq-plot from wald test for ", covariate))
qq_plot <- plot_qq(so, covariate, "wt", "full",
sig_level = snakemake@params[["sig_level_qq"]],
point_alpha = 0.2,
sig_color = "red",
highlight = NULL,
highlight_color = "green",
line_color = "blue") +
ggtitle(qq_plot_title) +
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
qq_list[[qq_plot_title]] <- qq_plot
beta_col_name <- str_c("b", covariate, sep = "_")
beta_se_col_name <- str_c(beta_col_name, "se", sep = "_")
all_wald <- sleuth_results(so, covariate, "wt", show_all = TRUE, pval_aggregate = FALSE) %>%
dplyr::select( target_id = target_id,
!!beta_col_name := b,
!!beta_se_col_name := se_b)
signed_pi_col_name <- str_c("signed_pi_value", covariate, sep = "_")
all <- inner_join(all, all_wald, by = "target_id") %>%
# calculate a signed version of the pi-value from:
# https://dx.doi.org/10.1093/bioinformatics/btr671
# e.g. useful for GSEA ranking
mutate( !!signed_pi_col_name := -log10(pval) * !!sym(beta_col_name) )
}
# saving volcano plots
marrange_volcano <- marrangeGrob(grobs=volcano_list, nrow=1, ncol=1, top = NULL)
ggsave(snakemake@output[["volcano_plots"]], plot = marrange_volcano, width = 14)
# saving ma-plots
marrange_ma <- marrangeGrob(grobs=ma_list, nrow=1, ncol=1, top = NULL)
ggsave(snakemake@output[["ma_plots"]], plot = marrange_ma, width = 14)
}
if(mode == "mostsignificant") {
# for each gene, select the most significant transcript (this is equivalent to sleuth_gene_table, but with bug fixes)
all <- all %>%
drop_na(ens_gene) %>%
group_by(ens_gene) %>%
filter( qval == min(qval, na.rm = TRUE) ) %>%
# ties in qval (e.g. min(qval) == 1) can lead to multiple entries per gene
filter( pval == min(pval, na.rm = TRUE) ) %>%
# for min(qval) == 1, then usually also min(pval) == 1, and
# we need something else to select a unique entry per gene
filter( mean_obs == max(mean_obs, na.rm = TRUE) ) %>%
# sometimes we still get two transcript variants with exactly
# the same values, i.e. they have exactly the same sequence
# but (slightly) different annotations -- then we retain a string
# with a comma-separated list of them
mutate(target_id = str_c(target_id, collapse=",")) %>%
distinct() %>%
# useful sort for scrolling through output by increasing q-values
arrange(qval)
# qq-plot from likelihood ratio test
print(str_c("Performing qq-plot from likelihood ratio test"))
qq_plot_title_trans <- str_c(plot_model, ": qq-plot from likelihood ratio test")
qq_plot_trans <- plot_qq(so,
test = 'reduced:full',
test_type = 'lrt',
sig_level = snakemake@params[["sig_level_qq"]],
point_alpha = 0.2,
sig_color = "red",
highlight = NULL,
highlight_color = "green",
line_color = "blue") +
ggtitle(qq_plot_title_trans) +
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
qq_list[[qq_plot_title_trans]] <- qq_plot_trans
}
# saving qq-plots
marrange_qq <- marrangeGrob(grobs=qq_list, nrow=1, ncol=1, top = NULL)
ggsave(snakemake@output[["qq_plots"]], plot = marrange_qq, width = 14)
write_tsv(all, path = output, quote_escape = "none")
write_rds(all, path = output_all, compress = "none")
}
write_results(sleuth_object, "transcripts", snakemake@output[["transcripts"]], snakemake@output[["transcripts_rds"]])
write_results(sleuth_object, "aggregate", snakemake@output[["genes_aggregated"]], snakemake@output[["genes_aggregated_rds"]])
write_results(sleuth_object, "mostsignificant", snakemake@output[["genes_mostsigtrans"]], snakemake@output[["genes_mostsigtrans_rds"]])
|
|
| plot_diffexp_heatmap |
1 |
- results/plots/diffexp-heatmap/model_X.diffexp-heatmap.pdf
|
|
- r-sleuth =0.30
- bioconductor-rhdf5lib =1.6.0
- bioconductor-rhdf5 =2.28.0
- bioconductor-biomart =2.42.0
- r-gridextra =2.3
- r-pheatmap =1.0.12
- r-tidyverse =1.2.1
- r-ggpubr =0.2.4
- r-base =3.6
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 | log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
library("sleuth")
so <- sleuth_load(snakemake@input[["so"]])
diffexp <- read.table(snakemake@input[["diffexp"]], sep = "\t", header = TRUE)
pdf(file = snakemake@output[[1]], width = 14)
plot_transcript_heatmap(so, transcripts = diffexp$target_id[1:20], main="Top 20 differentially expressed transcripts (TPM)")
# TODO, once pachterlab/sleuth#214 is merged, add this to get gene names
# , labels_row = diffexp$ext_gene[1:20])
dev.off()
|
|
| tpm_matrix |
1 |
- results/tables/tpm-matrix/model_X.tpm-matrix.tsv
|
|
- r-sleuth =0.30
- bioconductor-rhdf5lib =1.6.0
- bioconductor-rhdf5 =2.28.0
- bioconductor-biomart =2.42.0
- r-gridextra =2.3
- r-pheatmap =1.0.12
- r-tidyverse =1.2.1
- r-ggpubr =0.2.4
- r-base =3.6
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14 | log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
library("sleuth")
so <- sleuth_load(snakemake@input[[1]])
tpm <- sleuth_to_matrix(so, "obs_norm", "tpm")
target_mapping <- so$target_mapping
rownames(target_mapping) <- target_mapping$target_id
tpm <- cbind(ext_gene = target_mapping[rownames(tpm), "ext_gene"], tpm)
write.table(tpm, file = snakemake@output[[1]], col.names = NA, row.names = TRUE)
|
|
| ihw_fdr_control |
3 |
- results/tables/ihw/model_X.transcripts.ihw-results.tsv
- results/plots/ihw/transcripts/model_X.transcripts.plot-dispersion.pdf
- results/plots/ihw/transcripts/model_X.transcripts.plot-histograms.pdf
- results/plots/ihw/transcripts/model_X.transcripts.plot-trends.pdf
- results/plots/ihw/transcripts/model_X.transcripts.plot-decision.pdf
- results/plots/ihw/transcripts/model_X.transcripts.plot-adj-pvals.pdf
- results/tables/ihw/model_X.genes-aggregated.ihw-results.tsv
- results/plots/ihw/genes-aggregated/model_X.genes-aggregated.plot-dispersion.pdf
- results/plots/ihw/genes-aggregated/model_X.genes-aggregated.plot-histograms.pdf
- results/plots/ihw/genes-aggregated/model_X.genes-aggregated.plot-trends.pdf
- results/plots/ihw/genes-aggregated/model_X.genes-aggregated.plot-decision.pdf
- results/plots/ihw/genes-aggregated/model_X.genes-aggregated.plot-adj-pvals.pdf
- results/tables/ihw/model_X.genes-mostsigtrans.ihw-results.tsv
- results/plots/ihw/genes-mostsigtrans/model_X.genes-mostsigtrans.plot-dispersion.pdf
- results/plots/ihw/genes-mostsigtrans/model_X.genes-mostsigtrans.plot-histograms.pdf
- results/plots/ihw/genes-mostsigtrans/model_X.genes-mostsigtrans.plot-trends.pdf
- results/plots/ihw/genes-mostsigtrans/model_X.genes-mostsigtrans.plot-decision.pdf
- results/plots/ihw/genes-mostsigtrans/model_X.genes-mostsigtrans.plot-adj-pvals.pdf
|
|
- bioconductor-ihw =1.14.0
- r-tidyverse =1.2.1
- r-ggpubr =0.2.4
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117 | log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
library("tidyverse")
library("ggpubr")
library("IHW")
number_of_groups <- 7
gene_data <- read_tsv(snakemake@input[[1]]) %>%
drop_na()
level_name <- snakemake@wildcards[["level"]]
# calculate covariate mean_obs for gene.aggregated data
if(level_name == "genes-aggregated") {
gene_data <- gene_data %>%
mutate(mean_obs = if_else(num_aggregated_transcripts > 0, sum_mean_obs_counts / num_aggregated_transcripts, 0)) %>%
rename(ens_gene = target_id)
}
# determine the appropriate number of groups for grouping in ihw calculation
tested_number_of_groups <- number_of_groups
ihw_test_grouping <- function(x){
tryCatch(
expr = {
ihw(pval ~ mean_obs, data = gene_data, alpha = 0.1, nbins = x)
return(TRUE)
},
error = function(e) {
print(str_c("Number of groups was set too hight, trying grouping by ", tested_number_of_groups - 1, " groups."))
return(FALSE)
})
}
while(!ihw_test_grouping(tested_number_of_groups) && tested_number_of_groups > 0){
tested_number_of_groups <- tested_number_of_groups -1
ihw_test_grouping(tested_number_of_groups)
}
if (tested_number_of_groups < number_of_groups) {
print(str_c("The chosen number of groups for ", level_name, " dataset was too large, instead IHW was calculated on ", tested_number_of_groups, " groups."))
number_of_groups <- tested_number_of_groups
}
gene_data <- gene_data %>%
select(ens_gene, ext_gene, pval, mean_obs) %>%
mutate(grouping = groups_by_filter(mean_obs, number_of_groups))
### diagnostic plots for covariate and grouping
#dispersion plot
dispersion <- ggplot(gene_data, aes(x = percent_rank(mean_obs), y = -log10(pval))) +
geom_point() +
ggtitle("IHW: scatter plot of p-values vs. mean of counts") +
theme(plot.title = element_text(size = 12, face = "bold", hjust = 0.5)) +
xlab("percent rank of mean_obs") +
ylab(expression(-log[10](p-value)))
ggsave(snakemake@output[["dispersion"]], dispersion)
#histograms
hist_overall <- ggplot(gene_data, aes(x = pval)) +
geom_histogram(binwidth = 0.025, boundary = 0) +
xlab("p-values without grouping") +
ylab("density")
levels(gene_data$grouping) <- paste0(rep("group ", number_of_groups), 1:number_of_groups)
hist_groups <- ggplot(gene_data, aes(x = pval)) +
geom_histogram(binwidth = 0.025, boundary = 0) +
xlab("p-values of the individual groups") +
ylab("density") +
facet_wrap(~ grouping, nrow = ceiling(number_of_groups / 4))
plots_agg_mean_obs <- ggarrange(hist_overall, hist_groups, nrow = 1)
histograms <- annotate_figure(plots_agg_mean_obs,
top = text_grob("IHW: histograms for p-values of mean of counts",
color = "black",
face = "bold",
size = 12))
ggexport(histograms,
filename = snakemake@output[["histograms"]],
width=14)
# ihw calculation
ihw_results_mean <- ihw(pval ~ mean_obs, data = gene_data, alpha = 0.1, nbins = tested_number_of_groups)
ihw_mean <- as.data.frame(ihw_results_mean)
# merging ens_gene-IDs and ext_gene-names
if(all_equal(ihw_mean$pvalue, gene_data$pval) && all_equal(ihw_mean$covariate, gene_data$mean_obs)){
ihw_mean <- gene_data %>%
select(ens_gene, ext_gene) %>%
bind_cols(ihw_mean)
}
write_tsv(ihw_mean, snakemake@output[["results"]])
### diagnostic plots for ihw calculation
# plot of trends of the covariate
plot_trend_mean <- plot(ihw_results_mean) +
ggtitle("IHW: Plot of trends of mean of counts") +
theme(plot.title = element_text(size=12))
ggsave(snakemake@output[["trends"]], plot_trend_mean)
# plots of decision boundaries for unweighted p-values as a function of the covariate
plot_decision_mean <- plot(ihw_results_mean, what = "decisionboundary") +
ggtitle("IHW: Decision boundaries for unweighted p-values vs. mean of counts") +
theme(plot.title = element_text(size=12))
ggsave(snakemake@output[["decision"]], plot_decision_mean)
# p-values vs. adusted p-values
plot_ihw_pval_mean <- ggplot(ihw_mean, aes(x = pvalue, y = adj_pvalue, col = group)) +
geom_point(size = 0.25) +
ggtitle("IHW: raw p-values vs. adusted p-values from ihw-analysis") +
theme(plot.title = element_text(size=12)) +
scale_colour_hue(l = 70, c = 150, drop = FALSE)
ggsave(snakemake@output[["adj_pvals"]], plot_ihw_pval_mean)
|
|
| plot_diffexp_pval_hist |
3 |
- results/plots/diffexp/model_X.transcripts.diffexp-pval-hist.pdf
- results/plots/diffexp/model_X.genes-aggregated.diffexp-pval-hist.pdf
- results/plots/diffexp/model_X.genes-mostsigtrans.diffexp-pval-hist.pdf
|
|
- r-sleuth =0.30
- bioconductor-rhdf5lib =1.6.0
- bioconductor-rhdf5 =2.28.0
- bioconductor-biomart =2.42.0
- r-gridextra =2.3
- r-pheatmap =1.0.12
- r-tidyverse =1.2.1
- r-ggpubr =0.2.4
- r-base =3.6
|
1
2
3
4
5
6
7
8
9
10
11
12 | log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
library("sleuth")
library("ggplot2")
library("tidyr")
diffexp <- sleuth_load(snakemake@input[["diffexp_rds"]]) %>% drop_na(pval)
ggplot(diffexp) + geom_histogram(aes(pval), bins = 100)
ggsave(file = snakemake@output[[1]], width = 14)
|
|
| plot_pca |
2 |
- results/plots/pca/condition.pca.pdf
- results/plots/pc-variance/condition.pc-variance-plot.pdf
- results/plots/loadings/condition.loadings-plot.pdf
- results/plots/pca/batch_effect.pca.pdf
- results/plots/pc-variance/batch_effect.pc-variance-plot.pdf
- results/plots/loadings/batch_effect.loadings-plot.pdf
|
|
- r-sleuth =0.30
- bioconductor-rhdf5lib =1.6.0
- bioconductor-rhdf5 =2.28.0
- bioconductor-biomart =2.42.0
- r-gridextra =2.3
- r-pheatmap =1.0.12
- r-tidyverse =1.2.1
- r-ggpubr =0.2.4
- r-base =3.6
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37 | log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
library("sleuth")
library("ggpubr")
so <- sleuth_load(snakemake@input[[1]])
#principal components
pc <- 4
# plot pca
so <- sleuth_load(snakemake@input[[1]])
pc12 <- plot_pca(so, color_by = snakemake@wildcards[["covariate"]], units = "est_counts", text_labels = TRUE)
pc34 <- plot_pca(so, pc_x = 3, pc_y = 4, color_by = snakemake@wildcards[["covariate"]], units = "est_counts", text_labels = TRUE)
ggarrange(pc12, pc34, ncol = 2, common.legend = TRUE)
ggsave(snakemake@output[["pca"]], width=14)
# plot pc variance
plot_pc_variance(so, use_filtered = TRUE, units = "est_counts", pca_number = pc)
ggsave(snakemake@output[["pc_var"]], width=14)
# plot loadings
pc_loading_plots <- list()
for(i in 1:pc) {
pc_loading_plots[[paste0("pc", i, "_loading")]] <- plot_loadings(so,
scale = TRUE,
pc_input = i,
pc_count = 10,
units = "est_counts") +
ggtitle(paste0(snakemake@wildcards[["covariate"]], ": plot loadings for principal component ", i)) +
theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
}
plots_loading <- ggarrange(plotlist = pc_loading_plots, ncol = 1, nrow = 1, common.legend = TRUE)
ggexport(plots_loading, filename = snakemake@output[["loadings"]], width = 14)
|
|
| plot_group_density |
1 |
- results/plots/group_density/model_X.group_density.pdf
|
|
- r-sleuth =0.30
- bioconductor-rhdf5lib =1.6.0
- bioconductor-rhdf5 =2.28.0
- bioconductor-biomart =2.42.0
- r-gridextra =2.3
- r-pheatmap =1.0.12
- r-tidyverse =1.2.1
- r-ggpubr =0.2.4
- r-base =3.6
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 | log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
library("sleuth")
library("tidyverse")
so <- sleuth_load(snakemake@input[[1]])
plot_group_density(so,
use_filtered = TRUE,
units = "est_counts",
trans = "log",
grouping = setdiff(colnames(so$sample_to_covariates), "sample"),
offset = 1)
ggsave(snakemake@output[[1]])
|
|
| plot_scatter |
1 |
- results/plots/scatter/model_X.scatter.pdf
|
|
- r-sleuth =0.30
- bioconductor-rhdf5lib =1.6.0
- bioconductor-rhdf5 =2.28.0
- bioconductor-biomart =2.42.0
- r-gridextra =2.3
- r-pheatmap =1.0.12
- r-tidyverse =1.2.1
- r-ggpubr =0.2.4
- r-base =3.6
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69 | log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
library("sleuth")
library("tidyverse")
library("ggpubr")
so <- sleuth_load(snakemake@input[[1]])
plot_list <- list()
# combinations between samples without repetition
# sample_matrix <- t(combn(so$sample_to_covariates$sample, 2))
# combinations <- as.numeric(row(sample_matrix))
#
# for(i in combinations) {
# sample1 <- sample_matrix[i, 1]
# sample2 <- sample_matrix[i, 2]
#
# plot_list[[i]] <- plot_scatter(so,
# sample_x = sample1,
# sample_y = sample2,
# use_filtered = TRUE,
# units = "est_counts",
# point_alpha = 0.2,
# xy_line = TRUE,
# xy_line_color = "red") +
# labs(x = sample1, y = sample2)
# }
# all combinations between samples (with repitition)
samples <- so$sample_to_covariates$sample
sample_matrix <- crossing(x = samples, y = samples) # creates all combinations between samples
combinations <- as.numeric(row(sample_matrix))
for(i in combinations) {
sample1 <- sample_matrix[i, 1]
sample2 <- sample_matrix[i, 2]
if(sample1 != sample2) {
plot_list[[i]] <- plot_scatter(so,
sample_x = sample1,
sample_y = sample2,
use_filtered = TRUE,
units = "est_counts",
point_alpha = 0.2,
xy_line = TRUE,
xy_line_color = "red") +
labs(x = sample1, y = sample2)
} else {
x_val <- y_val <- c(0,0)
test <- tibble(x_val, y_val)
plot_list[[i]] <- ggplot(test, aes(x = x_val, y = y_val)) +
theme_void() +
geom_text(label = "") +
annotate("text", label = sample1, x=0, y=0, colour = "blue", fontface = "bold")
}
}
all_plots <- ggarrange(plotlist = plot_list)
plot_figure <- annotate_figure(all_plots,
top = text_grob("log transformed scatter plots of different samples",
color = "black",
face = "bold",
size = 14))
ggsave(snakemake@output[[1]], plot_figure)
|
|
| plot_bootstrap |
1 |
- results/plots/bootstrap/model_X
|
|
- r-sleuth =0.30
- bioconductor-rhdf5lib =1.6.0
- bioconductor-rhdf5 =2.28.0
- bioconductor-biomart =2.42.0
- r-gridextra =2.3
- r-pheatmap =1.0.12
- r-tidyverse =1.2.1
- r-ggpubr =0.2.4
- r-base =3.6
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42 | log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
library("tidyverse")
library("sleuth")
so <- sleuth_load(snakemake@input[["so"]])
top_n <- -strtoi(snakemake@params["top_n"])
results <- read_tsv(snakemake@input[["transcripts"]])
top_genes <- results %>%
filter(qval <= snakemake@params[["fdr"]]) %>%
top_n(top_n, qval) %>%
dplyr::select(ext_gene) %>%
drop_na() %>%
distinct(ext_gene)
genes_of_interest <- tibble( ext_gene = snakemake@params[["genes"]]) %>%
distinct(ext_gene)
genes <- full_join(top_genes, genes_of_interest, by = 'ext_gene') %>%
add_row(ext_gene = "Custom") %>%
pull(ext_gene)
dir.create( snakemake@output[[1]] )
for (gene in genes) {
transcripts <- results %>%
filter(ext_gene == gene) %>%
drop_na() %>%
pull(target_id)
if ( length( transcripts > 0 ) ) {
for (transcript in transcripts) {
plot_bootstrap(so, transcript, color_by = snakemake@params[["color_by"]], units = "tpm")
ggsave(file = str_c(snakemake@output[[1]], "/", gene, ".", transcript, ".", snakemake@wildcards[["model"]] , ".bootstrap.pdf"))
}
}
}
|
|
| plot_fragment_length_dist |
5 |
- results/plots/fld/A-1.fragment-length-dist.pdf
- results/plots/fld/B-1.fragment-length-dist.pdf
- results/plots/fld/B-2.fragment-length-dist.pdf
- results/plots/fld/C-1.fragment-length-dist.pdf
- results/plots/fld/D-1.fragment-length-dist.pdf
|
|
- r-sleuth =0.30
- bioconductor-rhdf5lib =1.6.0
- bioconductor-rhdf5 =2.28.0
- bioconductor-biomart =2.42.0
- r-gridextra =2.3
- r-pheatmap =1.0.12
- r-tidyverse =1.2.1
- r-ggpubr =0.2.4
- r-base =3.6
|
| log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
library("sleuth")
so <- sleuth_load(snakemake@input[[1]])
pdf(file = snakemake@output[[1]])
plot_fld(so, paste0(snakemake@wildcards[["sample"]], "-", snakemake@wildcards[["unit"]]))
dev.off()
|
|
| compose_sample_sheet |
1 |
- results/sleuth/samples.tsv
|
|
|
|
| sleuth_init |
2 |
- results/sleuth/model_X.rds
- results/sleuth/all.rds
|
|
- r-sleuth =0.30
- bioconductor-rhdf5lib =1.6.0
- bioconductor-rhdf5 =2.28.0
- bioconductor-biomart =2.42.0
- r-gridextra =2.3
- r-pheatmap =1.0.12
- r-tidyverse =1.2.1
- r-ggpubr =0.2.4
- r-base =3.6
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77 | log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
library("sleuth")
library("biomaRt")
library("tidyverse")
model <- snakemake@params[["model"]]
mart <- biomaRt::useMart(
biomart = "ENSEMBL_MART_ENSEMBL",
dataset = str_c(snakemake@params[["species"]], "_gene_ensembl"),
host = 'ensembl.org'
)
t2g <- biomaRt::getBM(
attributes = c( "ensembl_transcript_id",
"ensembl_gene_id",
"external_gene_name"),
mart = mart
) %>%
rename( target_id = ensembl_transcript_id,
ens_gene = ensembl_gene_id,
ext_gene = external_gene_name
)
samples <- read_tsv(snakemake@input[["samples"]], na = "", col_names = TRUE) %>%
# make everything except the index, sample name and path string a factor
mutate_at( vars(-X1, -sample, -path),
list(~factor(.))
)
if(!is.null(snakemake@params[["exclude"]])) {
samples <- samples %>%
filter( !sample %in% snakemake@params[["exclude"]] )
}
if(!is.null(model)) {
# retrieve the model formula
formula <- as.formula(model)
# extract variables from the formula and unnest any nested variables
variables <- labels(terms(formula)) %>%
strsplit('[:*]') %>%
unlist()
# remove samples with an NA value in any of the columns
# relevant for sleuth under the current model
samples <- samples %>%
drop_na(
c( sample, path, all_of(variables) )
)
}
so <- sleuth_prep( samples,
extra_bootstrap_summary = TRUE,
target_mapping = t2g,
aggregation_column = "ens_gene",
read_bootstrap_tpm = TRUE,
transform_fun_counts = function(x) log2(x + 0.5),
num_cores = snakemake@threads
)
custom_transcripts <- so$obs_raw %>%
# find transcripts not in the target_mapping
filter(!target_id %in% so$target_mapping$target_id) %>%
# make it a succinct list with no repititions
distinct(target_id) %>%
# pull it out into a vector
pull(target_id)
if(!length(custom_transcripts) == 0) {
so$target_mapping <- so$target_mapping %>%
# add those custom transcripts as rows to the target mapping
add_row(ens_gene = NA, ext_gene = "Custom", target_id = custom_transcripts)
}
sleuth_save(so, snakemake@output[[1]])
|
|
| kallisto_quant |
5 |
- results/kallisto/A-1
- results/kallisto/B-1
- results/kallisto/B-2
- results/kallisto/C-1
- results/kallisto/D-1
|
|
- kallisto =0.45
- hdf5 =1.10
|
| kallisto quant -i {input.idx} -o {output} {params.extra} {input.fq} 2> {log}
|
|
| cutadapt_pe |
4 |
- results/trimmed/A-1.1.fastq.gz
- results/trimmed/A-1.2.fastq.gz
- results/trimmed/A-1.qc.txt
- results/trimmed/B-1.1.fastq.gz
- results/trimmed/B-1.2.fastq.gz
- results/trimmed/B-1.qc.txt
- results/trimmed/C-1.1.fastq.gz
- results/trimmed/C-1.2.fastq.gz
- results/trimmed/C-1.qc.txt
- results/trimmed/D-1.1.fastq.gz
- results/trimmed/D-1.2.fastq.gz
- results/trimmed/D-1.qc.txt
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 | """Snakemake wrapper for trimming paired-end reads using cutadapt."""
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "julianderuiter@gmail.com"
__license__ = "MIT"
from snakemake.shell import shell
n = len(snakemake.input)
assert n == 2, "Input must contain 2 (paired-end) elements."
log = snakemake.log_fmt_shell(stdout=False, stderr=True)
shell(
"cutadapt"
" {snakemake.params}"
" -o {snakemake.output.fastq1}"
" -p {snakemake.output.fastq2}"
" {snakemake.input}"
" > {snakemake.output.qc} {log}")
|
|
| kallisto_index |
1 |
- results/kallisto/transcripts.idx
|
|
- kallisto =0.45
- hdf5 =1.10
|
| kallisto index -i {output} {input} 2> {log}
|
|
| cutadapt |
1 |
- results/trimmed/B-2.fastq.gz
- results/trimmed/B-2.qc.txt
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 | """Snakemake wrapper for trimming paired-end reads using cutadapt."""
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "julianderuiter@gmail.com"
__license__ = "MIT"
from snakemake.shell import shell
log = snakemake.log_fmt_shell(stdout=False, stderr=True)
shell(
"cutadapt"
" {snakemake.params}"
" -o {snakemake.output.fastq}"
" {snakemake.input[0]}"
" > {snakemake.output.qc} {log}")
|
|